#!/usr/bin/env node

const fs = require('fs');
const path = require('path');
const exec = require('child_process').exec;
const OptionParser = require('brisket/lib/optparse');
const {fgets, getVersion, awk, pad, toUTF16, ensureFileExists} = require('./utils.js');

// Path of the Scripts.txt file that setup() reads
const file_name = `${__dirname}/ucd/Scripts.txt`;
// Sparse table indexed by code point; setup() stores [script name, category code] pairs in it
const data = [];
// Version string taken from the "Scripts-<version>.txt" header comment; false until setup() finds it
let file_version = false;
// Name of the generator chosen by the command line options ('han' or 'nonletters')
let runmode;

/*
 * Code point ranges used by output_han_table(), grouped under a language label.
 * Each entry is [first code point, last code point, description].
 * Ranges starting at U+10000 or above are emitted as surrogate pair
 * alternatives, the others as plain \uXXXX-\uXXXX character class ranges.
 */
const han_table = {
	'Chinese': [
		[0x2E80, 0x2EFF, 'CJK Radicals Supplement'],
		[0x2F00, 0x2FDF, 'Kangxi Radicals'],
		[0x3100, 0x312F, 'Bopomofo'],
		[0x31A0, 0x31BF, 'Bopomofo Extended'],
		[0x3400, 0x4DBF, 'CJK Unified Ideographs Extension A'],
		// The CJK Unified Ideographs block runs up to U+9FFF; stopping at
		// U+9FCF would leave out the ideographs assigned after it.
		[0x4E00, 0x9FFF, 'CJK Unified Ideographs (Han)'],
		[0xF900, 0xFAFF, 'CJK Compatibility Ideographs'],
		[0x20000, 0x2A6DF, 'CJK Unified Ideographs Extension B'],
		[0x2A700, 0x2B73F, 'CJK Unified Ideographs Extension C'],
		[0x2B740, 0x2B81F, 'CJK Unified Ideographs Extension D'],
		[0x2B820, 0x2CEAF, 'CJK Unified Ideographs Extension E'],
		[0x2F800, 0x2FA1F, 'CJK Compatibility Ideographs Supplement']
	],
	'Japanese': [
		[0x3040, 0x309F, 'Hiragana'],
		[0x30A0, 0x30FF, 'Katakana'],
		[0x3190, 0x319F, 'Kanbun'],
		[0x31F0, 0x31FF, 'Katakana Phonetic Extensions'],
		[0xFF65, 0xFF9F, 'Halfwidth Katakana'],
		[0x1B000, 0x1B0FF, 'Kana Supplement']
	],
	'Korean': [
		[0x1100, 0x11FF, 'Hangul Jamo'],
		[0x3130, 0x318F, 'Hangul Compatibility Jamo'],
		[0xA960, 0xA97F, 'Hangul Jamo Extended-A'],
		[0xAC00, 0xD7AF, 'Hangul Syllables'],
		[0xD7B0, 0xD7FF, 'Hangul Jamo Extended-B'],
		[0xFFA0, 0xFFDC, 'Halfwidth Jamo']
	],
	'Yi': [
		[0xA000, 0xA48F, 'Yi Syllables'],
		[0xA490, 0xA4CF, 'Yi Radicals']
	]
};

/**
 * Parses the Scripts.txt file named by `file_name` into the module-level
 * `data` table.
 *
 * - `ensureFileExists(file_name)` (from ./utils.js) is called before reading.
 * - The first comment line containing "Scripts-<version>.txt" sets
 *   `file_version`; later ones are ignored.
 * - Every data line of the form "XXXX[..YYYY] ; Script # Gc ..." stores
 *   `[script name, two-character code following '#']` in `data` for each code
 *   point of the range. The rest of this file treats that code as the
 *   General Category.
 * - Lines whose first code point is U+10000 or above are skipped, as are
 *   lines that do not match the data line format.
 */
function setup () {
	ensureFileExists(file_name);

	const versionPattern = /Scripts-([0-9.]+)\.txt/;
	const entryPattern = /^([0-9A-F]+)(\.\.([0-9A-F]+))?\s*;\s*([^ #]+)\s*#\s*(..)/;
	const lines = fs.readFileSync(file_name, 'utf8').split('\n');

	for (const rawLine of lines) {
		// Trailing whitespace removal also drops the '\r' of CRLF line endings
		const line = rawLine.replace(/\s+$/, '');
		if (line === '') continue;

		if (/^\s*#/.test(line)) {
			if (file_version === false) {
				const versionMatch = versionPattern.exec(line);
				if (versionMatch) {
					file_version = versionMatch[1];
				}
			}
			continue;
		}

		const entry = entryPattern.exec(line);
		// Only a line that really matched may write into the table; values of
		// a previous line must never be reused for an unparsable one.
		if (!entry) continue;

		const low = parseInt(entry[1], 16);
		if (low >= 0x10000) continue;

		// Group 2 is the optional "..YYYY" part; without it the range is a single code point
		const high = entry[2] === undefined ? low : parseInt(entry[3], 16);
		const script_name = entry[4];
		const prop_val = entry[5];

		for (let cp = low; cp <= high; cp++) {
			data[cp] = [script_name, prop_val];
		}
	}
}

/**
 * Prints the source code of the REGEX_HAN_FAMILY definition to stdout.
 *
 * Ranges of `han_table` below U+10000 become "\uXXXX-\uXXXX" members of one
 * character class; ranges at or above U+10000 become surrogate pair
 * alternatives produced by getCharacterClassFromSMP().
 *
 * @throws {Error} when `han_table` yields no range below U+10000 or no range
 *   at or above it, because the emitted expression needs both parts
 */
function output_han_table () {
	const littleRanges = [];
	const bigRanges = [];

	Object.keys(han_table).forEach((script, scriptIndex) => {
		let headerEmitted = false;
		han_table[script].forEach(range => {
			const first = range[0];
			const last = range[1];
			const label = range[2];

			if (first >= 0x10000) {
				bigRanges.push(`\t/* ${script}, ${label} (U+${pad(first)} - U+${pad(last)}) */`);
				bigRanges.push(`\t'${getCharacterClassFromSMP(first, last)}',`);
				return;
			}

			if (!headerEmitted) {
				// Blank line between language groups, but not before the first one
				if (scriptIndex > 0) {
					littleRanges.push('');
				}
				littleRanges.push(`\t/* ${script} */`);
				headerEmitted = true;
			}
			littleRanges.push(`\t'\\\\u${pad(first)}-\\\\u${pad(last)}',\t\t// ${label}`);
		});
	});

	if (littleRanges.length === 0 || bigRanges.length === 0) {
		throw new Error('han_table must contain at least one range below U+10000 and one at or above it');
	}

	// The last element of each generated array literal carries no separator comma
	const dropSeparatorOfLastLine = list => {
		list[list.length - 1] = list[list.length - 1].replace(/,/, '');
	};
	dropSeparatorOfLastLine(littleRanges);
	dropSeparatorOfLastLine(bigRanges);

	console.log(`// Ideographic letters in Scripts.txt of Unicode ${file_version}`);
	console.log(`// generated by "src/unicode-tools/${path.basename(__filename)} --han"`);
	console.log(`const REGEX_HAN_FAMILY = new RegExp('[' + [`);
	console.log(littleRanges.join('\n'));
	console.log(`].join('') + ']|' + [`);
	console.log(bigRanges.join('\n'));
	console.log(`].join('|'));`);
	console.log('');
}

/**
 * Prints the source code of a character class regex covering a selection of
 * the code points stored in `data`.
 *
 * @param {?RegExp} include - when given, only code points whose category
 *   code matches are kept
 * @param {?RegExp} exclude - when given, code points whose category code
 *   matches are dropped
 * @param {string} var_name - name of the constant in the generated code
 */
function output_nonletters (include, exclude, var_name) {
	// Runs of consecutive selected code points
	const runs = [];
	data.forEach((d, cp) => {
		const category = d[1];
		// `include` selects: everything that does NOT match is skipped
		if (include && !include.test(category)) return;
		if (exclude && exclude.test(category)) return;

		const current = runs.length > 0 ? runs[runs.length - 1] : null;
		if (current !== null && cp === current.last + 1) {
			current.last = cp;
		}
		else {
			runs.push({first: cp, last: cp});
		}
	});

	const codes = [];
	if (include) {
		codes.push(`// ${include.source} of General Category in Scripts.txt of Unicode ${file_version}`);
	}
	else if (exclude) {
		codes.push(`// not(${exclude}) of General Category in Scripts.txt of Unicode ${file_version}`);
	}
	codes.push(`// generated by "src/unicode-tools/${path.basename(__filename)} --nonletters"`);
	codes.push(`const ${var_name} = new RegExp('[\\`);

	// Two backslashes are printed so that the generated string literal hands "\uXXXX" to RegExp
	const escapeOf = cp => '\\\\u' + pad(cp);
	let code = '';
	runs.forEach(run => {
		switch (run.last - run.first) {
		case 0:
			code += escapeOf(run.first);
			break;
		case 1:
			// A run of two is listed as two members; a range would not be shorter
			code += escapeOf(run.first) + escapeOf(run.last);
			break;
		default:
			code += escapeOf(run.first) + '-' + escapeOf(run.last);
		}
		// Wrap the generated literal; the trailing backslash continues the string on the next line
		if (code.length >= 90) {
			codes.push(code + '\\');
			code = '';
		}
	});
	if (code !== '') {
		codes.push(code + '\\');
	}
	codes.push("]');");
	console.log(codes.join('\n'));
	console.log('');
}

/**
 * Prints a SCRIPT_TABLE listing built from `data`: one row per run of
 * entries sharing a script name, holding the first code point, the last code
 * point and a numeric id assigned to the script in order of first appearance.
 * Runs of the 'Common' script are left out of the listing (they still take
 * an id). Not called from the command line dispatch visible in this file.
 */
function output_scripts () {
	const runs = [];
	const idByScript = {};

	data.forEach((entry, codePoint) => {
		const scriptName = entry[0];
		const tail = runs.length > 0 ? runs[runs.length - 1] : null;

		if (tail !== null && scriptName === tail[0]) {
			// Same script as the previous entry: extend the open run
			tail[3] = codePoint;
		}
		else {
			runs.push([scriptName, entry[1], codePoint, codePoint]);
		}

		if (!(scriptName in idByScript)) {
			idByScript[scriptName] = Object.keys(idByScript).length;
		}
	});

	const rows = runs
		.filter(run => run[0] !== 'Common')
		.map(run => `    0x${pad(run[2])}, 0x${pad(run[3])}, ${pad(idByScript[run[0]], ' ', 3)}, // ${run[0]}`);

	const lines = [
		`// Scripts of Unicode ${file_version}`,
		`const SCRIPT_TABLE = {\\`
	].concat(rows);

	// The final row loses the comma in front of its trailing comment
	const lastIndex = lines.length - 1;
	lines[lastIndex] = lines[lastIndex].replace(/,( \/\/)/, ' $1');
	lines.push('};');

	console.log(lines.join('\n'));
	console.log('');
	console.log('');
}

/**
 * Builds the text of a regex alternation matching every code point from
 * `from` to `to` (inclusive) as a pair of UTF-16 code units obtained from
 * toUTF16().
 *
 * @param {number} from - first code point
 * @param {number} to - last code point
 * @returns {string} alternatives joined by '|', with each code unit written
 *   as a doubly escaped "\\uXXXX"; empty when `from` is greater than `to`
 */
function getCharacterClassFromSMP (from, to) {
	// Each group: [first high unit, last high unit, first low unit, last low unit]
	const groups = [];

	// Pass 1: one group per high unit, widened as further low units arrive
	for (let codePoint = from; codePoint <= to; codePoint++) {
		const units = toUTF16(codePoint);
		const newest = groups.length ? groups[groups.length - 1] : null;
		if (newest && newest[0] == units[0]) {
			newest[3] = units[1];
		}
		else {
			groups.push([units[0], units[0], units[1], units[1]]);
		}
	}

	// Pass 2: fold neighbours with consecutive high units and identical low spans
	let index = 0;
	while (index < groups.length - 1) {
		const here = groups[index];
		const next = groups[index + 1];
		const foldable = here[1] + 1 == next[0]
			&& here[2] == next[2]
			&& here[3] == next[3];
		if (foldable) {
			here[1] = next[0];
			groups.splice(index + 1, 1);
		}
		else {
			index++;
		}
	}

	const unitText = unit => `\\\\u${pad(unit)}`;
	// A single unit is written bare, a span as a bracketed range
	const spanText = (first, last) => first != last
		? `[${unitText(first)}-${unitText(last)}]`
		: unitText(first);

	return groups
		.map(group => spanText(group[0], group[1]) + spanText(group[2], group[3]))
		.join('|');
}

/**
 * Prints the usage text to stdout, one line per option, and terminates the
 * process with exit status 1.
 */
function printHelp () {
	const usage = [
		'usage: --han           Generate CJK Ideograph definition code',
		'       --nonletters    Generate non-letters definition code'
	];
	for (const text of usage) {
		console.log(text);
	}
	process.exit(1);
}

// Builds an option handler that stores the given generator name in `runmode`.
// The handler keeps one (unused) parameter, like the callbacks it replaces.
const selectMode = mode => v => {
	runmode = mode;
};

// Register both switches and parse the command line
(new OptionParser)
	.on('--han          Generate CJK Ideograph definition code', selectMode('han'))
	.on('--nonletters   Generate non-letters definition code', selectMode('nonletters'))
	.parse(process.argv);

// Entry point: once the promise returned by getVersion() is fulfilled, run
// the generator selected on the command line, or show the usage text when no
// generator was selected.
getVersion()
.then(() => {
	switch (runmode) {
	case 'han':
		setup();
		output_han_table();
		break;
	case 'nonletters':
		setup();
		output_nonletters(null, /[ZLN]./, 'REGEX_NON_LETTER');
		break;
	default:
		printHelp();
		break;
	}
})
.catch(error => {
	console.error(error);
	// A failed run must not look successful to the calling shell or build script
	process.exitCode = 1;
});
